# encoding=utf-8
"""
@author: xiao nian
@contact: xiaonian030@163.com
@time: 2021-12-12 15:30
"""
from sklearn.model_selection import train_test_split
from config.config import MODEL_CONFIG
import xlrd3 as xlrd


def data():
    """Load the labelled corpus from ``data/train.xls`` and split it.

    The first sheet of the workbook is read and its first row is skipped as
    a header. In every remaining row, column 1 is the text and columns 2 and
    3 are up to two labels. Each of those cells is converted with ``str``,
    stripped, and has embedded newlines removed. Rows whose text is empty,
    or whose two labels are both empty, are dropped.

    When ``MODEL_CONFIG['multi_label']`` is truthy, each sample's label is a
    list of its non-empty labels (in column order). Otherwise it is a single
    string: the first label, or the second one if the first is empty.

    Returns:
        A tuple ``(train_x, train_y, valid_x, valid_y, test_x, test_y)``.
        About 70% of the samples go to the training set; the remaining 30%
        is split evenly into validation and test sets. Both splits use
        ``random_state=42`` so the partition is reproducible.
    """
    # Corpus: texts and their labels, kept index-aligned.
    text_list = []
    label_list = []

    # Read the Excel workbook.
    book = xlrd.open_workbook("data/train.xls")
    table = book.sheet_by_index(0)

    # Row 0 is the header row, so iteration starts at row 1.
    for rx in range(1, table.nrows):
        cols = table.row_values(rx)
        try:
            raw_cells = (cols[1], cols[2], cols[3])
        except IndexError:
            # The row has fewer than four columns; treat it as unusable
            # instead of hiding every possible error behind a bare except.
            continue

        # NOTE(review): xlrd hands numeric cells back as floats, so a label
        # typed as the number 1 would become '1.0' here — confirm labels are
        # stored as text in the sheet.
        text_item, label_item_one, label_item_two = (
            str(cell).strip().replace("\n", "") for cell in raw_cells
        )

        # Non-empty labels, preserving column order.
        labels = [
            label for label in (label_item_one, label_item_two) if label != ''
        ]
        if text_item == '' or not labels:
            continue

        text_list.append(text_item)
        if MODEL_CONFIG['multi_label']:
            # Multi-label: keep every non-empty label for the sample.
            label_list.append(labels)
        else:
            # Single-label: first non-empty label wins.
            label_list.append(labels[0])

    # Prepare the training, validation and test sets. train_test_split
    # returns (first part, second part) for each input; with test_size=0.7
    # the *second* part is the 70% share, which is used as the training set.
    remain_x, train_x, remain_y, train_y = train_test_split(
        text_list, label_list, test_size=0.7, random_state=42
    )
    valid_x, test_x, valid_y, test_y = train_test_split(
        remain_x, remain_y, test_size=0.5, random_state=42
    )
    return train_x, train_y, valid_x, valid_y, test_x, test_y
